﻿using System;
using System.Collections.Generic;
using System.Text;
using com.dz.ct.html;
using System.Text.RegularExpressions;
using com.dz.ct.common;
using System.Web;

namespace com.dz.ct.components.SmallSite.www.google.com
{
    public class SmallSiteObject
    {
        /// <summary>
        /// Minimum length that extracted chapter content must reach to be accepted.
        /// Starts at 1000; GetBookChapter resets it to 300 or 1000 for each chapter
        /// before the extraction helpers read it.
        /// </summary>
        private int m_chapter_content_length = 1000;

        /// <summary>
        /// Searches Baidu for the book and tries to collect the requested chapters
        /// from the pages listed in the search results.
        /// </summary>
        /// <remarks>
        /// Result pages are visited in order. After each page the chapters that are
        /// still missing are recomputed, and the loop stops as soon as fewer than ten
        /// chapters remain uncollected. Any exception raised while searching or
        /// collecting is logged and whatever was gathered so far is returned.
        /// </remarks>
        /// <param name="i_book_name">Book title; ASCII letters, digits and parentheses are stripped before it is used as a search keyword. A null title is treated as empty.</param>
        /// <param name="i_chapter_list">Chapters that should be collected.</param>
        /// <returns>
        /// null when <paramref name="i_chapter_list"/> is null or empty; otherwise the
        /// list of chapters that could be collected (possibly empty).
        /// </returns>
        public List<BookChapterInfo> CollectBookChapterList(string i_book_name,List<BookChapterInfo> i_chapter_list)
        {
            if (i_chapter_list == null || i_chapter_list.Count <= 0)
                return null;

            // Regex.Replace throws on a null input, so normalise the title first.
            i_book_name = Regex.Replace(i_book_name ?? string.Empty, "[a-zA-Z0-9()（）]", "", RegexOptions.Compiled | RegexOptions.IgnoreCase);

            Encoding t_encoding = Encoding.GetEncoding("gb2312");

            string t_key_word = string.Format("小说{0}最新章节txt", i_book_name);

            string t_baidu_url = string.Format("http://www.baidu.com/s?wd={0}", HttpUtility.UrlEncode(t_key_word, t_encoding));

            // Group 1 = result URL, group 2 = result title.
            // "href\s*=\s*" tolerates whitespace around '='; the previous pattern had
            // lost its backslashes ("hrefs*=s*") and only matched a bare "href=".
            string t_list_reg = "<h3\\s*?class=[\'\"]?t[\'\"]?><a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>\\s*?</h3>";

            List<BookChapterInfo> t_vip_chapter_list = new List<BookChapterInfo>();

            try
            {
                string t_html = NetSiteCatchManager.ReadUrl(t_baidu_url, t_encoding);

                if (!string.IsNullOrEmpty(t_html))
                {
                    MatchCollection t_ma = Regex.Matches(t_html, t_list_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);

                    for (int index = 0; index < t_ma.Count; index++)
                    {
                        string t_book_url = t_ma[index].Groups[1].Value;

                        // An empty href cannot be fetched; move on to the next result.
                        if (string.IsNullOrEmpty(t_book_url))
                            continue;

                        string t_book_html = NetSiteCatchManager.ReadUrl(t_book_url, Encoding.Default);

                        // Only ask this page for chapters that earlier pages did not provide.
                        List<BookChapterInfo> t_need_collect_list = GetNeedCollectChapter(i_chapter_list, t_vip_chapter_list);

                        List<BookChapterInfo> t_collect_chapter_list = GetBookChapterList(t_book_url, t_book_html, t_need_collect_list, i_book_name);

                        if (t_collect_chapter_list != null && t_collect_chapter_list.Count > 0)
                            t_vip_chapter_list.AddRange(t_collect_chapter_list);

                        // Stop once fewer than ten chapters are still missing.
                        if (t_vip_chapter_list.Count > 0 && i_chapter_list.Count - t_vip_chapter_list.Count < 10)
                            break;
                    }
                }
            }
            catch (Exception ex)
            {
                LogHelper.Error("从google收录章节列表失败" + ex.ToString());
            }
            return t_vip_chapter_list;
        }

        /// <summary>
        /// Works out which of the wanted chapters have not been collected yet.
        /// </summary>
        /// <param name="i_vip_list">All chapters that should be collected.</param>
        /// <param name="i_have_collect_list">Chapters collected so far; may be null or empty.</param>
        /// <returns>
        /// <paramref name="i_vip_list"/> itself when nothing has been collected yet;
        /// otherwise a new list holding the wanted chapters whose ChapterName does not
        /// equal the ChapterName of any collected chapter.
        /// </returns>
        private List<BookChapterInfo> GetNeedCollectChapter(List<BookChapterInfo> i_vip_list, List<BookChapterInfo> i_have_collect_list)
        {
            bool t_nothing_collected = i_have_collect_list == null || i_have_collect_list.Count <= 0;

            if (t_nothing_collected)
                return i_vip_list;

            List<BookChapterInfo> t_pending = new List<BookChapterInfo>();

            for (int t_wanted_index = 0; t_wanted_index < i_vip_list.Count; t_wanted_index++)
            {
                BookChapterInfo t_wanted = i_vip_list[t_wanted_index];

                // Count every collected chapter that carries the same name.
                int t_same_name_count = 0;

                for (int t_done_index = 0; t_done_index < i_have_collect_list.Count; t_done_index++)
                {
                    if (t_wanted.ChapterName == i_have_collect_list[t_done_index].ChapterName)
                        t_same_name_count++;
                }

                if (t_same_name_count <= 0)
                    t_pending.Add(t_wanted);
            }

            return t_pending;
        }

        /// <summary>
        /// Matches the wanted chapters against the links of a chapter-index page and
        /// downloads each chapter that can be located.
        /// </summary>
        /// <remarks>
        /// For every wanted chapter the first link whose text matches the chapter name
        /// (see CompareChapterName) is used. Processing of this page stops at the first
        /// chapter whose URL cannot be resolved or whose content cannot be obtained;
        /// chapters gathered before that point are still returned.
        /// </remarks>
        /// <param name="i_url">URL of the index page; also the base for resolving relative links.</param>
        /// <param name="i_html">HTML of the index page.</param>
        /// <param name="i_chapter_list">Chapters to look for.</param>
        /// <param name="i_book_name">Book title, passed on to GetBookChapter.</param>
        /// <returns>
        /// The chapters collected from this page (possibly empty); null when
        /// NetSiteCatchManager.IsPiraticSite rejects the URL, when the HTML or the
        /// chapter list is missing, or when an exception occurred (which is logged).
        /// </returns>
        private List<BookChapterInfo> GetBookChapterList(string i_url,string i_html, List<BookChapterInfo> i_chapter_list,string i_book_name)
        {
            if (!NetSiteCatchManager.IsPiraticSite(i_url))
                return null;

            if (string.IsNullOrEmpty(i_html))
                return null;

            // Nothing to look for; previously this surfaced as a caught NullReferenceException.
            if (i_chapter_list == null)
                return null;

            // Group 1 = link target, group 2 = link text.
            // "href\s*=\s*" tolerates whitespace around '='; the previous pattern had
            // lost its backslashes ("hrefs*=s*") and only matched a bare "href=".
            string t_chapter_name_reg = "<a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>";

            List<BookChapterInfo> t_chapter_list = new List<BookChapterInfo>();

            try
            {
                MatchCollection t_ma = Regex.Matches(i_html, t_chapter_name_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);

                foreach (BookChapterInfo t_ch in i_chapter_list)
                {
                    // Find the first link whose text matches this chapter's name.
                    Match t_hit = null;

                    foreach (Match t_mc in t_ma)
                    {
                        if (CompareChapterName(t_mc.Groups[2].Value.Trim(), t_ch.ChapterName))
                        {
                            t_hit = t_mc;
                            break;
                        }
                    }

                    // This page does not list the chapter; try the next one.
                    if (t_hit == null)
                        continue;

                    string t_chapter_url = NetSiteCatchManager.GetFullUrl(i_url, t_hit.Groups[1].Value.Trim());

                    // An unresolvable link ends processing of this page.
                    if (string.IsNullOrEmpty(t_chapter_url))
                        break;

                    BookChapterInfo t_chapter = GetBookChapter(t_chapter_url, t_ch, i_book_name);

                    // So does a chapter whose content could not be obtained.
                    if (t_chapter == null)
                        break;

                    t_chapter_list.Add(t_chapter);
                }

                return t_chapter_list;
            }
            catch (Exception ex)
            {
                LogHelper.Error("从百度分离章节名称失败" + ex.ToString());
                return null;
            }
        }


        /// <summary>
        /// Downloads one chapter and wraps its content into a new BookChapterInfo.
        /// </summary>
        /// <remarks>
        /// The content is first read from <paramref name="i_url"/>. If that yields too
        /// little text, a search by chapter name is tried instead. The minimum length
        /// is lowered to 300 when the chapter name contains one of the characters
        /// 完, 终 or 结 (a closing chapter is not guaranteed to reach 1000 characters);
        /// otherwise it is 1000. The threshold is stored in m_chapter_content_length.
        /// </remarks>
        /// <param name="i_url">URL of the chapter page.</param>
        /// <param name="i_chapter">Chapter to collect; its metadata is copied to the result.</param>
        /// <param name="i_book_name">Book title, used by the search fallback.</param>
        /// <returns>The populated chapter, or null when no sufficiently long content was found.</returns>
        private BookChapterInfo GetBookChapter(string i_url, BookChapterInfo i_chapter, string i_book_name)
        {
            string t_name = i_chapter.ChapterName;

            bool t_looks_like_ending = t_name.IndexOf("完") > -1 || t_name.IndexOf("终") > -1 || t_name.IndexOf("结") > -1;

            m_chapter_content_length = t_looks_like_ending ? 300 : 1000;

            // First attempt: the chapter page itself.
            string t_body = NetSiteCatchManager.ReplaceContent(GetContent(i_url, t_name));

            if (string.IsNullOrEmpty(t_body) || t_body.Length < m_chapter_content_length)
            {
                // Second attempt: look the chapter up by name.
                t_body = NetSiteCatchManager.ReplaceContent(GetChapterContentByChapterName(i_book_name, t_name));

                bool t_usable = !string.IsNullOrEmpty(t_body) && t_body.Length >= m_chapter_content_length;

                if (!t_usable)
                    return null;
            }

            t_body = string.Format("document.write('{0}');", t_body);

            BookChapterInfo t_result = new BookChapterInfo();
            t_result.ChapterName = t_name;
            t_result.ChapterContent = t_body;
            // The word count is taken after wrapping, so it includes the wrapper text.
            t_result.WordsCount = t_body.Length;
            t_result.Comfrom = i_url;
            t_result.IsVip = i_chapter.IsVip;
            t_result.UpdateTime = i_chapter.UpdateTime;
            t_result.VolumeName = i_chapter.VolumeName;
            t_result.BookId = i_chapter.BookId;
            t_result.SiteId = i_chapter.SiteId;

            return t_result;
        }

        /// <summary>
        /// Downloads a chapter page and extracts the chapter content from it.
        /// </summary>
        /// <param name="i_url">URL of the chapter page.</param>
        /// <param name="i_chapter_name">Chapter name; not used by this method.</param>
        /// <returns>
        /// The content returned by GetChapterContent, or string.Empty when an
        /// exception occurred (which is logged).
        /// </returns>
        private string GetContent(string i_url, string i_chapter_name)
        {
            Encoding t_page_encoding = Encoding.Default;

            try
            {
                string t_page = NetSiteCatchManager.ReadUrl(i_url, t_page_encoding);

                // One retry when the first download came back empty.
                if (string.IsNullOrEmpty(t_page))
                    t_page = NetSiteCatchManager.ReadUrl(i_url, t_page_encoding);

                return GetChapterContent(t_page);
            }
            catch (Exception ex)
            {
                LogHelper.Error("获取页面内容失败" + ex.ToString());

                return string.Empty;
            }
        }


        /// <summary>
        /// Extracts the chapter body from a downloaded page.
        /// </summary>
        /// <remarks>
        /// Table cells are examined first, then div elements. In each pass the last
        /// element whose filtered text is longer than m_chapter_content_length wins
        /// and its HTML is used. If neither pass produces enough content, the text is
        /// rebuilt with GetContentByReg.
        /// </remarks>
        /// <param name="i_html">HTML of the chapter page.</param>
        /// <returns>The extracted content, or string.Empty when it is shorter than m_chapter_content_length.</returns>
        private string GetChapterContent(string i_html)
        {
            HtmlDocument t_document = HtmlDocument.Create(i_html);

            string t_picked = PickLongElementHtml(t_document, "td", "[a-zA-Z0-9]");

            if (!string.IsNullOrEmpty(t_picked))
                return t_picked;

            t_picked = PickLongElementHtml(t_document, "div", "[a-zA-Z0-9,\\/;_()]");

            bool t_need_fallback = string.IsNullOrEmpty(t_picked) || t_picked.Length < m_chapter_content_length;

            if (t_need_fallback)
                t_picked = GetContentByReg(i_html);

            return t_picked.Length < m_chapter_content_length ? string.Empty : t_picked;
        }

        /// <summary>
        /// Scans all elements with the given tag name and returns the HTML of the last
        /// one whose filtered inner text is longer than m_chapter_content_length.
        /// </summary>
        /// <param name="i_document">Parsed page.</param>
        /// <param name="i_tag_name">Tag name of the elements to scan.</param>
        /// <param name="i_noise_pattern">Regex of characters removed from the text before its length is measured.</param>
        /// <returns>The HTML of the last qualifying element, or string.Empty when none qualifies.</returns>
        private string PickLongElementHtml(HtmlDocument i_document, string i_tag_name, string i_noise_pattern)
        {
            RegexOptions t_options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            string t_html = string.Empty;

            foreach (HtmlElement t_element in i_document.GetElementsByTagName(i_tag_name))
            {
                // Drop tag pairs first, then the noise characters.
                string t_text = Regex.Replace(t_element.InnerText, "<.*?>.*?</.*?>", "", t_options);
                t_text = Regex.Replace(t_text, i_noise_pattern, "", t_options);

                // No break: a later qualifying element replaces an earlier one.
                if (t_text.Length > m_chapter_content_length)
                    t_html = t_element.HTML;
            }

            return t_html;
        }

        /// <summary>
        /// Rebuilds the chapter text from raw HTML with regular expressions.
        /// </summary>
        /// <remarks>
        /// Every run of text that starts and ends with a character in the range
        /// U+4E00..U+9FA5 and contains no angle brackets is a candidate segment. A
        /// segment is kept when the markup between it and the next segment, after
        /// removing non-breaking-space entities, paragraph tags, line-break tags and
        /// common punctuation, is shorter than 10 characters. Each kept segment is
        /// followed by a paragraph marker. The last segment has no successor to
        /// compare against and is therefore never emitted.
        /// Line-break tags are recognised in plain, closing and self-closing form;
        /// previously only the plain form was stripped, so segments separated by two
        /// self-closing breaks were dropped.
        /// </remarks>
        /// <param name="i_html">HTML of the chapter page; null or empty yields an empty result.</param>
        /// <returns>The concatenated segments, or string.Empty when nothing qualifies.</returns>
        private string GetContentByReg(string i_html)
        {
            // Regex.Matches throws on null input.
            if (string.IsNullOrEmpty(i_html))
                return string.Empty;

            RegexOptions t_options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            StringBuilder t_sb = new StringBuilder();

            string t_reg = "([\u4E00-\u9FA5][^<>]*[\u4E00-\u9FA5])";

            MatchCollection t_ma = Regex.Matches(i_html, t_reg, t_options);

            for (int index = 0; index + 1 < t_ma.Count; index++)
            {
                Match t_current = t_ma[index];

                // The gap runs from the end of this segment to the start of the next one.
                int t_start_index = t_current.Index + t_current.Length;

                int t_length = t_ma[index + 1].Index - t_start_index;

                string t_sub_html = i_html.Substring(t_start_index, t_length);

                t_sub_html = Regex.Replace(t_sub_html, "&nbsp;", "", t_options);

                t_sub_html = Regex.Replace(t_sub_html, "<[/]*p[^<>]*>", "", t_options);

                // Matches <br>, </br>, <br/> and <br />.
                t_sub_html = Regex.Replace(t_sub_html, "<[/]*br\\s*[/]?>", "", t_options);

                t_sub_html = Regex.Replace(t_sub_html, "[【】（），！？(),!?;；、……]", "", t_options);

                // Little residue means the two segments belong to the same body text.
                if (t_sub_html.Length < 10)
                {
                    t_sb.Append(t_current.Value);
                    t_sb.Append("<p>&nbsp;&nbsp;&nbsp;&nbsp;");
                }
            }

            return t_sb.ToString();
        }

        /// <summary>
        /// Decides whether two chapter names refer to the same chapter.
        /// </summary>
        /// <remarks>
        /// Names are equal when they are identical, or when, after removing
        /// whitespace and common punctuation, one contains the other (ordinal
        /// comparison). A name that is null, or that is empty after normalisation,
        /// never matches a different name. Without that rule an empty link text
        /// would be "contained" in every chapter name.
        /// </remarks>
        /// <param name="i_chapter_source">Name found on the page (link text).</param>
        /// <param name="i_chapter_target">Name of the wanted chapter.</param>
        /// <returns>true when the names are considered the same chapter; otherwise false.</returns>
        private bool CompareChapterName(string i_chapter_source, string i_chapter_target)
        {
            if (i_chapter_source == null || i_chapter_target == null)
                return false;

            if (i_chapter_source.Equals(i_chapter_target))
                return true;

            // Whitespace and punctuation that sites tend to add, drop or vary.
            const string t_noise_reg = "[\\s【】（），！？(),!?;\\.；、/……]";

            string t_source = Regex.Replace(i_chapter_source, t_noise_reg, "", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            string t_target = Regex.Replace(i_chapter_target, t_noise_reg, "", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            // IndexOf("") is 0 for any string, so an empty name must be rejected explicitly.
            if (t_source.Length == 0 || t_target.Length == 0)
                return false;

            return t_source.IndexOf(t_target, StringComparison.Ordinal) > -1
                || t_target.IndexOf(t_source, StringComparison.Ordinal) > -1;
        }

        /// <summary>
        /// Searches Baidu for a chapter by name and extracts its content from the
        /// first result page that provides enough text.
        /// </summary>
        /// <remarks>
        /// The chapter name is the search keyword; when it is shorter than 5
        /// characters the book title is prepended. Each result page is downloaded
        /// (with one retry), checked with NetSiteCatchManager.IsContainChapterName
        /// and then passed to GetChapterContent. Previously the check and the
        /// extraction sat inside the retry branch, so a page that downloaded on the
        /// first attempt was never parsed.
        /// </remarks>
        /// <param name="i_book_name">Book title.</param>
        /// <param name="i_chapter_name">Chapter name; null yields an empty result.</param>
        /// <returns>
        /// The content of the first page that reaches m_chapter_content_length;
        /// otherwise whatever the last examined page produced (possibly empty).
        /// string.Empty when an exception occurred (which is logged).
        /// </returns>
        private string GetChapterContentByChapterName(string i_book_name, string i_chapter_name)
        {
            if (i_chapter_name == null)
                return string.Empty;

            string t_key_word = i_chapter_name;

            // Short chapter names are too ambiguous on their own; add the book title.
            if (i_chapter_name.Length < 5)
            {
                t_key_word = string.Format("{0} {1}", i_book_name, i_chapter_name);
            }

            Encoding t_encoding = Encoding.GetEncoding("gb2312");

            string t_baidu_url = string.Format("http://www.baidu.com/s?wd={0}", HttpUtility.UrlEncode(t_key_word, t_encoding));

            // Group 1 = result URL, group 2 = result title.
            // "href\s*=\s*" tolerates whitespace around '='; the previous pattern had
            // lost its backslashes ("hrefs*=s*") and only matched a bare "href=".
            string t_list_reg = "<h3\\s*?class=[\'\"]?t[\'\"]?><a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>\\s*?</h3>";

            string t_chapter_content = string.Empty;

            try
            {
                string t_html = NetSiteCatchManager.ReadUrl(t_baidu_url, t_encoding);

                if (!string.IsNullOrEmpty(t_html))
                {
                    MatchCollection t_ma = Regex.Matches(t_html, t_list_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);

                    foreach (Match t_mc in t_ma)
                    {
                        string t_chapter_url = t_mc.Groups[1].Value;

                        if (string.IsNullOrEmpty(t_chapter_url))
                            continue;

                        string t_page = NetSiteCatchManager.ReadUrl(t_chapter_url, Encoding.Default);

                        // One retry when the first download came back empty.
                        if (string.IsNullOrEmpty(t_page))
                            t_page = NetSiteCatchManager.ReadUrl(t_chapter_url, Encoding.Default);

                        if (string.IsNullOrEmpty(t_page))
                            continue;

                        if (NetSiteCatchManager.IsContainChapterName(i_book_name, i_chapter_name, t_page) == false)
                            continue;

                        t_chapter_content = GetChapterContent(t_page);

                        // ">=" mirrors the caller, which rejects only content shorter than the threshold.
                        if (!string.IsNullOrEmpty(t_chapter_content) && t_chapter_content.Length >= m_chapter_content_length)
                            break;
                    }
                }

                return t_chapter_content;
            }
            catch (Exception ex)
            {
                LogHelper.Error("根据章节名称收录章节失败" + ex.ToString());
                return string.Empty;
            }
        }


    }
}
